# Source Generated with Decompyle++
# File: in.pyo (Python 2.5)

from __future__ import with_statement
from callbacks import callsback
from threads import threaded
from threads.timeout_thread import Timer
from net import build_opener, build_cookie
import re
import StringIO
import cookielib
import urllib2
import logging
import lxml.etree as ET
import lxml.html as HTML
import operator
from contextlib import closing

itemgetter0 = operator.itemgetter(0)
log = logging.getLogger('httptools')

# 'sentinel' is normally provided as a builtin elsewhere in this codebase; it is
# used below as the "no default supplied" marker for WebScraper.get_cookie().
# Defined here as a stand-in (assumption) so this module is self-contained.
sentinel = object()

class RequestOpener(object):
    retries = 3
    pause_for_attempts = 1
    js_redirect_res = ((re.compile('window.location.replace\\("(.*?)"\\);'), 1),)

    def __init__(self, opener, request, data=None, **kwds):
        self.openfunc = getattr(opener, 'open', opener)
        retries = kwds.pop('retries', None)
        if retries is not None:
            self.retries = retries

        if isinstance(request, basestring):
            request = urllib2.Request.make_request(request, data, **kwds)

        self.request = request
        self._sub_requester = None
        self.callback = None

    @callsback
    def open(self, callback=None):
        # callsback packages the success=/error= keyword arguments the caller
        # passes into a single callback object (delivered via finish() below)
        if self.callback is not None:
            raise Exception('Request already in progress')

        self.callback = callback
        self._attempt_open()

    def _attempt_open(self):
        self.openfunc(self.request, success=self._check_success, error=self._check_error)

    def preprocess_response(self, resp):
        # read the whole body, then graft a seekable StringIO onto the response
        # object so the data can be re-read later
        with closing(resp):
            data = resp.read()

        sio = StringIO.StringIO(data)
        for attr in ('read', 'seek', 'close', 'tell'):
            setattr(resp, attr, getattr(sio, attr))

        resp._stringio = sio
        resp.content = data
        return resp

    def _check_success(self, resp):
        resp = self.preprocess_response(resp)

        try:
            self.redirect(resp)
        except Exception:
            # no redirect found (or it could not be built); check the response
            # for errors and finish normally if there are none
            error = self.check_resp_for_errors(resp)
            if error is None:
                self.finish('success', resp)
            else:
                self._on_error(error)

    def _redirect_success(self, resp):
        self._sub_requester = None
        self.finish('success', resp)

    def _redirect_error(self, err=None):
        self._sub_requester = None
        self._on_error(err)

    def redirect(self, resp):
        if self._sub_requester is not None:
            raise Exception('Redirect already in progress')

        redirect = self.make_redirect_request(resp)
        new = self._sub_requester = type(self)(self.openfunc, redirect)
        new.open(success=self._redirect_success, error=self._redirect_error)

    def make_redirect_request(self, resp):
        for redirecter in (self._find_http_redirect, self._find_js_redirect):
            redirect = redirecter(resp)
            if redirect is not None:
                if not redirect.startswith('http'):
                    # relative redirect: rebuild an absolute URL from the
                    # original request's scheme and host
                    if not redirect.startswith('/'):
                        redirect = '/' + redirect

                    redirect = self.request.get_type() + '://' + self.request.get_host() + redirect

                log.debug('got redirect: %r', redirect)
                return redirect

        raise Exception("Couldn't find URL for redirect in %r" % resp.content)

    def _find_http_redirect(self, resp):
        if resp.code in (301, 302):
            return resp.headers.get('Location', None)

    def _find_js_redirect(self, resp):
        for redirect_re, url_group_id in self.js_redirect_res:
            match = redirect_re.search(resp.content)
            if match:
                new_url = match.group(url_group_id)
                if new_url:
                    return new_url

    def check_resp_for_errors(self, resp):
        # hook for subclasses: return an error object to trigger the retry/error
        # path, or None (the default) to treat the response as a success
        pass

    def _check_error(self, err=None):
        self._on_error(err)

    def _on_error(self, e=None):
        self.retries -= 1
        if self.retries:
            if self.pause_for_attempts > 0:
                Timer(self.pause_for_attempts, self._attempt_open).start()
            else:
                self._attempt_open()
        else:
            self.finish('error', e)

    def finish(self, result, *args):
        cb = self.callback
        self.callback = None
        self._sub_requester = None
        getattr(cb, result, lambda *a: None)(*args)


def dispatcher(what, arg_getter):

    def dispatch(self, *args):
        name = arg_getter(args)
        handler = getattr(self, '%s_%s' % (what, name), getattr(self, '%s_default' % what, None))
        if handler is not None:
            return handler(*args)
        else:
            log.error('No default handler for %r', what)

    return dispatch

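# Illustration (not part of the original source): dispatcher() builds methods
# that route on their first argument. WebScraper below defines, for example,
#     build_request = dispatcher('build_request', itemgetter0)
# so scraper.build_request('login') calls scraper.build_request_login('login')
# when such a method exists and falls back to build_request_default('login')
# otherwise. The 'login' name is hypothetical; only the *_default handlers are
# actually defined below.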

class WebScraper(object):
    CookieJarFactory = cookielib.CookieJar
    HttpOpenerFactory = staticmethod(build_opener)
    RequestFactory = staticmethod(urllib2.Request.make_request)
    domain = None
    urls = {}

    def __init__(self):
        self._waiting = set()
        self._callbacks = {}
        self.init_http()

    def init_http(self):
        self._jar = self.CookieJarFactory()
        self.http = self.HttpOpenerFactory(urllib2.HTTPCookieProcessor(self._jar))

    def get_cookie(self, key, default=sentinel, domain=None, path='/'):
        if domain is None:
            domain = self.domain

        val = default
        try:
            with self._jar._cookies_lock:
                val = self._jar._cookies[domain][path][key].value
        except (AttributeError, KeyError), e:
            # no such cookie: re-raise unless the caller supplied a default
            if val is sentinel:
                raise e

        return val

    def set_cookie(self, key, value, domain=None, path='/'):
        if domain is None:
            domain = self.domain

        with self._jar._cookies_lock:
            domain_dict = self._jar._cookies.setdefault(domain, {})
            path_dict = domain_dict.setdefault(path, {})
            cookie = path_dict.get(key, None)
            if cookie is None:
                cookie = build_cookie(key, value, domain=domain, path=path)
                path_dict[key] = cookie
            else:
                cookie.value = value

    def set_waiting(self, *things):
        self._waiting.update(things)

    def clear_waiting(self, *things):
        self._waiting -= set(things)
        if not self._waiting:
            self.done_waiting()

    def done_waiting(self):
        # hook for subclasses: called when no more requests are outstanding
        pass

    @callsback
    def request(self, name, callback=None):
        if name in self._waiting:
            log.warning('already waiting for %r', name)
            return None

        self._callbacks[name] = callback
        req = self.build_request(name)
        self.perform_request(name, req)

    def perform_request(self, name, req):
        self.set_waiting(name)
        if req is None:
            return self.error_handler(name)(Exception('No request created for %r' % name))

        reqopen = RequestOpener(threaded(self.http.open), req)
        reqopen.open(success=self.success_handler(name), error=self.error_handler(name))

    def error_handler(self, name):

        def handler(e=(None, None)):
            self.clear_waiting(name)
            cb = self._callbacks.pop(name, None)
            retval = self.handle_error(name, e)
            if cb is not None:
                cb.error(e)

            return retval

        return handler

    def success_handler(self, name):

        def handler(resp):
            self.clear_waiting(name)
            resp = self.preprocess_resp(name, resp)
            newresp = self.handle_success(name, resp)
            if newresp is not None:
                resp = newresp

            cb = self._callbacks.pop(name, None)
            if cb is not None:
                cb.success(resp)

            return newresp

        return handler

    build_request = dispatcher('build_request', itemgetter0)
    handle_error = dispatcher('handle_error', itemgetter0)
    preprocess_resp = dispatcher('preprocess_resp', itemgetter0)
    handle_success = dispatcher('handle_success', itemgetter0)

    def build_request_default(self, name):
        link = self.urls[name]
        if callable(link):
            link = link()

        return self.RequestFactory(link)

    def handle_error_default(self, name, e):
        log.error('Error requesting %r: %r', name, e)

    def handle_success_default(self, name, resp):
        if resp.document is not None:
            print HTML.tostring(resp.document, pretty_print=True)
        else:
            print 'Got None for lxml doc. code/status= %r' % ((resp.code, resp.msg, str(resp.headers)),)

    def preprocess_resp_default(self, name, resp):
        data = resp.content
        if data:
            document = HTML.fromstring(data, base_url=resp.geturl())
            document.make_links_absolute()
            resp.document = document
        else:
            resp.document = None

        return resp


if __name__ == '__main__':
    pass
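
# Example usage (illustrative sketch, not part of the original module): a
# WebScraper subclass supplies a urls mapping and, optionally, per-name
# handlers following the dispatcher() naming convention. The names used here
# (ExampleScraper, 'home', on_success, on_error) are hypothetical, and running
# this requires the callbacks/threads/net helpers from the surrounding codebase.
#
#     class ExampleScraper(WebScraper):
#         domain = 'example.com'
#         urls = {'home': 'http://example.com/'}
#
#         def handle_success_home(self, name, resp):
#             log.info('fetched %d bytes for %r', len(resp.content), name)
#
#     def on_success(resp):
#         print resp.content[:200]
#
#     def on_error(err):
#         print 'request failed: %r' % (err,)
#
#     scraper = ExampleScraper()
#     scraper.request('home', success=on_success, error=on_error)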